{
  "nbformat": 4,
  "nbformat_minor": 0,
  "metadata": {
    "colab": {
      "name": "regression_crime.ipynb",
      "provenance": [],
      "collapsed_sections": []
    },
    "kernelspec": {
      "name": "python3",
      "display_name": "Python 3"
    }
  },
  "cells": [
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "sK-RhlsGxwpd",
        "colab_type": "text"
      },
      "source": [
        "##### Copyright 2020 Google LLC.\n",
        "\n",
        "\n",
        "Licensed under the Apache License, Version 2.0 (the 'License');\n",
        "you may not use this file except in compliance with the License.\n",
        "You may obtain a copy of the License at\n",
        "\n",
        "    https://www.apache.org/licenses/LICENSE-2.0\n",
        "\n",
        "Unless required by applicable law or agreed to in writing, software\n",
        "distributed under the License is distributed on an 'AS IS' BASIS,\n",
        "WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n",
        "See the License for the specific language governing permissions and\n",
        "limitations under the License."
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "fuN9k6Gux-yD",
        "colab_type": "text"
      },
      "source": [
        "This colab contains TensorFlow code for implementing the constrained optimization methods presented in the paper:\n",
        "> Harikrishna Narasimhan, Andrew Cotter, Maya Gupta, Serena Wang, 'Pairwise Fairness for Ranking and Regression', AAAI 2020. [<a href='https://arxiv.org/pdf/1906.05330.pdf'>link</a>]\n",
        "\n",
        "First, let's install and import the relevant libraries."
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "JXgLyAJm0UyB",
        "colab_type": "code",
        "colab": {}
      },
      "source": [
        "import matplotlib.pyplot as plt\n",
        "import numpy as np\n",
        "import pandas as pd\n",
        "import random\n",
        "import sys\n",
        "from sklearn import model_selection\n",
        "import tensorflow as tf"
      ],
      "execution_count": 0,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "DvhGP5TW0V_J",
        "colab_type": "code",
        "outputId": "e58bb5f0-7d86-4edf-dcae-bf0f5ba0aa5a",
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 872
        }
      },
      "source": [
        "!pip install git+https://github.com/google-research/tensorflow_constrained_optimization"
      ],
      "execution_count": 4,
      "outputs": [
        {
          "output_type": "stream",
          "text": [
            "Collecting git+https://github.com/google-research/tensorflow_constrained_optimization\n",
            "  Cloning https://github.com/google-research/tensorflow_constrained_optimization to /tmp/pip-req-build-qdt4wk1d\n",
            "  Running command git clone -q https://github.com/google-research/tensorflow_constrained_optimization /tmp/pip-req-build-qdt4wk1d\n",
            "Requirement already satisfied: numpy in /usr/local/lib/python3.6/dist-packages (from tfco-nightly==0.3.dev20200613) (1.18.5)\n",
            "Requirement already satisfied: scipy in /usr/local/lib/python3.6/dist-packages (from tfco-nightly==0.3.dev20200613) (1.4.1)\n",
            "Requirement already satisfied: six in /usr/local/lib/python3.6/dist-packages (from tfco-nightly==0.3.dev20200613) (1.12.0)\n",
            "Requirement already satisfied: tensorflow>=1.14 in /usr/local/lib/python3.6/dist-packages (from tfco-nightly==0.3.dev20200613) (2.2.0)\n",
            "Requirement already satisfied: tensorflow-estimator<2.3.0,>=2.2.0 in /usr/local/lib/python3.6/dist-packages (from tensorflow>=1.14->tfco-nightly==0.3.dev20200613) (2.2.0)\n",
            "Requirement already satisfied: absl-py>=0.7.0 in /usr/local/lib/python3.6/dist-packages (from tensorflow>=1.14->tfco-nightly==0.3.dev20200613) (0.9.0)\n",
            "Requirement already satisfied: google-pasta>=0.1.8 in /usr/local/lib/python3.6/dist-packages (from tensorflow>=1.14->tfco-nightly==0.3.dev20200613) (0.2.0)\n",
            "Requirement already satisfied: keras-preprocessing>=1.1.0 in /usr/local/lib/python3.6/dist-packages (from tensorflow>=1.14->tfco-nightly==0.3.dev20200613) (1.1.2)\n",
            "Requirement already satisfied: tensorboard<2.3.0,>=2.2.0 in /usr/local/lib/python3.6/dist-packages (from tensorflow>=1.14->tfco-nightly==0.3.dev20200613) (2.2.2)\n",
            "Requirement already satisfied: protobuf>=3.8.0 in /usr/local/lib/python3.6/dist-packages (from tensorflow>=1.14->tfco-nightly==0.3.dev20200613) (3.10.0)\n",
            "Requirement already satisfied: opt-einsum>=2.3.2 in /usr/local/lib/python3.6/dist-packages (from tensorflow>=1.14->tfco-nightly==0.3.dev20200613) (3.2.1)\n",
            "Requirement already satisfied: termcolor>=1.1.0 in /usr/local/lib/python3.6/dist-packages (from tensorflow>=1.14->tfco-nightly==0.3.dev20200613) (1.1.0)\n",
            "Requirement already satisfied: grpcio>=1.8.6 in /usr/local/lib/python3.6/dist-packages (from tensorflow>=1.14->tfco-nightly==0.3.dev20200613) (1.29.0)\n",
            "Requirement already satisfied: wheel>=0.26; python_version >= \"3\" in /usr/local/lib/python3.6/dist-packages (from tensorflow>=1.14->tfco-nightly==0.3.dev20200613) (0.34.2)\n",
            "Requirement already satisfied: h5py<2.11.0,>=2.10.0 in /usr/local/lib/python3.6/dist-packages (from tensorflow>=1.14->tfco-nightly==0.3.dev20200613) (2.10.0)\n",
            "Requirement already satisfied: wrapt>=1.11.1 in /usr/local/lib/python3.6/dist-packages (from tensorflow>=1.14->tfco-nightly==0.3.dev20200613) (1.12.1)\n",
            "Requirement already satisfied: astunparse==1.6.3 in /usr/local/lib/python3.6/dist-packages (from tensorflow>=1.14->tfco-nightly==0.3.dev20200613) (1.6.3)\n",
            "Requirement already satisfied: gast==0.3.3 in /usr/local/lib/python3.6/dist-packages (from tensorflow>=1.14->tfco-nightly==0.3.dev20200613) (0.3.3)\n",
            "Requirement already satisfied: tensorboard-plugin-wit>=1.6.0 in /usr/local/lib/python3.6/dist-packages (from tensorboard<2.3.0,>=2.2.0->tensorflow>=1.14->tfco-nightly==0.3.dev20200613) (1.6.0.post3)\n",
            "Requirement already satisfied: requests<3,>=2.21.0 in /usr/local/lib/python3.6/dist-packages (from tensorboard<2.3.0,>=2.2.0->tensorflow>=1.14->tfco-nightly==0.3.dev20200613) (2.23.0)\n",
            "Requirement already satisfied: markdown>=2.6.8 in /usr/local/lib/python3.6/dist-packages (from tensorboard<2.3.0,>=2.2.0->tensorflow>=1.14->tfco-nightly==0.3.dev20200613) (3.2.2)\n",
            "Requirement already satisfied: setuptools>=41.0.0 in /usr/local/lib/python3.6/dist-packages (from tensorboard<2.3.0,>=2.2.0->tensorflow>=1.14->tfco-nightly==0.3.dev20200613) (47.1.1)\n",
            "Requirement already satisfied: werkzeug>=0.11.15 in /usr/local/lib/python3.6/dist-packages (from tensorboard<2.3.0,>=2.2.0->tensorflow>=1.14->tfco-nightly==0.3.dev20200613) (1.0.1)\n",
            "Requirement already satisfied: google-auth<2,>=1.6.3 in /usr/local/lib/python3.6/dist-packages (from tensorboard<2.3.0,>=2.2.0->tensorflow>=1.14->tfco-nightly==0.3.dev20200613) (1.7.2)\n",
            "Requirement already satisfied: google-auth-oauthlib<0.5,>=0.4.1 in /usr/local/lib/python3.6/dist-packages (from tensorboard<2.3.0,>=2.2.0->tensorflow>=1.14->tfco-nightly==0.3.dev20200613) (0.4.1)\n",
            "Requirement already satisfied: idna<3,>=2.5 in /usr/local/lib/python3.6/dist-packages (from requests<3,>=2.21.0->tensorboard<2.3.0,>=2.2.0->tensorflow>=1.14->tfco-nightly==0.3.dev20200613) (2.9)\n",
            "Requirement already satisfied: urllib3!=1.25.0,!=1.25.1,<1.26,>=1.21.1 in /usr/local/lib/python3.6/dist-packages (from requests<3,>=2.21.0->tensorboard<2.3.0,>=2.2.0->tensorflow>=1.14->tfco-nightly==0.3.dev20200613) (1.24.3)\n",
            "Requirement already satisfied: chardet<4,>=3.0.2 in /usr/local/lib/python3.6/dist-packages (from requests<3,>=2.21.0->tensorboard<2.3.0,>=2.2.0->tensorflow>=1.14->tfco-nightly==0.3.dev20200613) (3.0.4)\n",
            "Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.6/dist-packages (from requests<3,>=2.21.0->tensorboard<2.3.0,>=2.2.0->tensorflow>=1.14->tfco-nightly==0.3.dev20200613) (2020.4.5.1)\n",
            "Requirement already satisfied: importlib-metadata; python_version < \"3.8\" in /usr/local/lib/python3.6/dist-packages (from markdown>=2.6.8->tensorboard<2.3.0,>=2.2.0->tensorflow>=1.14->tfco-nightly==0.3.dev20200613) (1.6.0)\n",
            "Requirement already satisfied: rsa<4.1,>=3.1.4 in /usr/local/lib/python3.6/dist-packages (from google-auth<2,>=1.6.3->tensorboard<2.3.0,>=2.2.0->tensorflow>=1.14->tfco-nightly==0.3.dev20200613) (4.0)\n",
            "Requirement already satisfied: cachetools<3.2,>=2.0.0 in /usr/local/lib/python3.6/dist-packages (from google-auth<2,>=1.6.3->tensorboard<2.3.0,>=2.2.0->tensorflow>=1.14->tfco-nightly==0.3.dev20200613) (3.1.1)\n",
            "Requirement already satisfied: pyasn1-modules>=0.2.1 in /usr/local/lib/python3.6/dist-packages (from google-auth<2,>=1.6.3->tensorboard<2.3.0,>=2.2.0->tensorflow>=1.14->tfco-nightly==0.3.dev20200613) (0.2.8)\n",
            "Requirement already satisfied: requests-oauthlib>=0.7.0 in /usr/local/lib/python3.6/dist-packages (from google-auth-oauthlib<0.5,>=0.4.1->tensorboard<2.3.0,>=2.2.0->tensorflow>=1.14->tfco-nightly==0.3.dev20200613) (1.3.0)\n",
            "Requirement already satisfied: zipp>=0.5 in /usr/local/lib/python3.6/dist-packages (from importlib-metadata; python_version < \"3.8\"->markdown>=2.6.8->tensorboard<2.3.0,>=2.2.0->tensorflow>=1.14->tfco-nightly==0.3.dev20200613) (3.1.0)\n",
            "Requirement already satisfied: pyasn1>=0.1.3 in /usr/local/lib/python3.6/dist-packages (from rsa<4.1,>=3.1.4->google-auth<2,>=1.6.3->tensorboard<2.3.0,>=2.2.0->tensorflow>=1.14->tfco-nightly==0.3.dev20200613) (0.4.8)\n",
            "Requirement already satisfied: oauthlib>=3.0.0 in /usr/local/lib/python3.6/dist-packages (from requests-oauthlib>=0.7.0->google-auth-oauthlib<0.5,>=0.4.1->tensorboard<2.3.0,>=2.2.0->tensorflow>=1.14->tfco-nightly==0.3.dev20200613) (3.1.0)\n",
            "Building wheels for collected packages: tfco-nightly\n",
            "  Building wheel for tfco-nightly (setup.py) ... \u001b[?25l\u001b[?25hdone\n",
            "  Created wheel for tfco-nightly: filename=tfco_nightly-0.3.dev20200613-cp36-none-any.whl size=148305 sha256=20125b69ff217dcf5a0dab72f9de0760456a7319ba6837b2dcbef6344aa52526\n",
            "  Stored in directory: /tmp/pip-ephem-wheel-cache-syjh4pkj/wheels/c9/b3/c3/78e0691949466af462380554286105216cd95a9ae7cf08ee78\n",
            "Successfully built tfco-nightly\n",
            "Installing collected packages: tfco-nightly\n",
            "Successfully installed tfco-nightly-0.3.dev20200613\n"
          ],
          "name": "stdout"
        }
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "XGFoSFuX0XJc",
        "colab_type": "code",
        "colab": {}
      },
      "source": [
        "import tensorflow_constrained_optimization as tfco"
      ],
      "execution_count": 0,
      "outputs": []
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "gBUr48pLzsqK",
        "colab_type": "text"
      },
      "source": [
        "## Pairwise Regression Fairness"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "_Wgq7N73rPHV",
        "colab_type": "text"
      },
      "source": [
        "We will be training a linear scoring function $f(x) = w^\\top x$ where $x \\in \\mathbb{R}^d$ is the input feature vector. Our goal is to train the regression model subject to pairwise fairness constraints.\n"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "6tZrx9BfOB_Q",
        "colab_type": "text"
      },
      "source": [
        "Specifically, for the regression model $f$, we denote:\n",
        "- $sqerr(f)$ as the squared error for model $f$.\n",
        "$$\n",
        "sqerr(f) = \\mathbf{E}\\big[\\big(f(x) - y\\big)^2\\big]\n",
        "$$\n",
        "\n",
        "\n",
        "- $err_{i,j}(f)$ as the pairwise error over example pairs where the higher label example is from group $i$, and the lower label example is from group $j$.\n",
        "\n",
        "$$\n",
        "err_{i, j}(f) = \\mathbf{E}\\big[\\mathbb{I}\\big(f(x) < f(x')\\big) \\,\\big|\\, y > y',~ grp(x) = i, ~grp(x') = j\\big]\n",
        "$$\n",
        "<br>\n",
        "\n",
        "We then wish to solve the following constrained problem:\n",
        "$$min_f\\; sqerr(f)$$\n",
        "$$\\text{   s.t.   } |err_{i,j}(f) - err_{k,\\ell}(f)| \\leq \\epsilon \\;\\;\\; \\forall ((i,j), (k,\\ell)) \\in \\mathcal{G},$$\n",
        "\n",
        "where $\\mathcal{G}$ contains the pairs we are interested in constraining."
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "qM-PzAuykOmN",
        "colab_type": "text"
      },
      "source": [
        "## Load Communities & Crime Data\n",
        "\n",
        "We will use the benchmark Communities and Crimes dataset from the UCI Machine Learning repository for our illustration. This dataset contains various demographic and racial distribution details (aggregated from census and law enforcement data sources) about different communities in the US, along with the per capita crime rate in each commmunity. Our goal is to predict the crime rate for a community, a *regression* problem. We consider communities where the percentage of black population is above the 70-th percentile as the protected group."
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "QSUkaGKxBa2M",
        "colab_type": "code",
        "colab": {}
      },
      "source": [
        "# We will divide the data into 25 minibatches and refer to them as 'queries'.\n",
        "num_queries = 25\n",
        "\n",
        "# List of column names in the dataset.\n",
        "column_names = [\"state\", \"county\", \"community\", \"communityname\", \"fold\", \"population\", \"householdsize\", \"racepctblack\", \"racePctWhite\", \"racePctAsian\", \"racePctHisp\", \"agePct12t21\", \"agePct12t29\", \"agePct16t24\", \"agePct65up\", \"numbUrban\", \"pctUrban\", \"medIncome\", \"pctWWage\", \"pctWFarmSelf\", \"pctWInvInc\", \"pctWSocSec\", \"pctWPubAsst\", \"pctWRetire\", \"medFamInc\", \"perCapInc\", \"whitePerCap\", \"blackPerCap\", \"indianPerCap\", \"AsianPerCap\", \"OtherPerCap\", \"HispPerCap\", \"NumUnderPov\", \"PctPopUnderPov\", \"PctLess9thGrade\", \"PctNotHSGrad\", \"PctBSorMore\", \"PctUnemployed\", \"PctEmploy\", \"PctEmplManu\", \"PctEmplProfServ\", \"PctOccupManu\", \"PctOccupMgmtProf\", \"MalePctDivorce\", \"MalePctNevMarr\", \"FemalePctDiv\", \"TotalPctDiv\", \"PersPerFam\", \"PctFam2Par\", \"PctKids2Par\", \"PctYoungKids2Par\", \"PctTeen2Par\", \"PctWorkMomYoungKids\", \"PctWorkMom\", \"NumIlleg\", \"PctIlleg\", \"NumImmig\", \"PctImmigRecent\", \"PctImmigRec5\", \"PctImmigRec8\", \"PctImmigRec10\", \"PctRecentImmig\", \"PctRecImmig5\", \"PctRecImmig8\", \"PctRecImmig10\", \"PctSpeakEnglOnly\", \"PctNotSpeakEnglWell\", \"PctLargHouseFam\", \"PctLargHouseOccup\", \"PersPerOccupHous\", \"PersPerOwnOccHous\", \"PersPerRentOccHous\", \"PctPersOwnOccup\", \"PctPersDenseHous\", \"PctHousLess3BR\", \"MedNumBR\", \"HousVacant\", \"PctHousOccup\", \"PctHousOwnOcc\", \"PctVacantBoarded\", \"PctVacMore6Mos\", \"MedYrHousBuilt\", \"PctHousNoPhone\", \"PctWOFullPlumb\", \"OwnOccLowQuart\", \"OwnOccMedVal\", \"OwnOccHiQuart\", \"RentLowQ\", \"RentMedian\", \"RentHighQ\", \"MedRent\", \"MedRentPctHousInc\", \"MedOwnCostPctInc\", \"MedOwnCostPctIncNoMtg\", \"NumInShelters\", \"NumStreet\", \"PctForeignBorn\", \"PctBornSameState\", \"PctSameHouse85\", \"PctSameCity85\", \"PctSameState85\", \"LemasSwornFT\", \"LemasSwFTPerPop\", \"LemasSwFTFieldOps\", \"LemasSwFTFieldPerPop\", \"LemasTotalReq\", \"LemasTotReqPerPop\", \"PolicReqPerOffic\", \"PolicPerPop\", \"RacialMatchCommPol\", \"PctPolicWhite\", \"PctPolicBlack\", \"PctPolicHisp\", \"PctPolicAsian\", \"PctPolicMinor\", \"OfficAssgnDrugUnits\", \"NumKindsDrugsSeiz\", \"PolicAveOTWorked\", \"LandArea\", \"PopDens\", \"PctUsePubTrans\", \"PolicCars\", \"PolicOperBudg\", \"LemasPctPolicOnPatr\", \"LemasGangUnitDeploy\", \"LemasPctOfficDrugUn\", \"PolicBudgPerPop\", \"ViolentCrimesPerPop\"]\n",
        "\n",
        "dataset_url = \"http://archive.ics.uci.edu/ml/machine-learning-databases/communities/communities.data\"\n",
        "\n",
        "# Read dataset from the UCI web repository and assign column names.\n",
        "data_df = pd.read_csv(dataset_url, sep=\",\", names=column_names,\n",
        "                      na_values=\"?\")\n",
        "\n",
        "# Make sure that there are no missing values in the \"ViolentCrimesPerPop\" column.\n",
        "assert(not data_df[\"ViolentCrimesPerPop\"].isna().any())\n",
        "\n",
        "# Real-valued label: \"ViolentCrimesPerPop\".\n",
        "labels_df = data_df[\"ViolentCrimesPerPop\"]\n",
        "\n",
        "# Now that we have assigned binary labels, \n",
        "# we drop the \"ViolentCrimesPerPop\" column from the data frame.\n",
        "data_df.drop(columns=\"ViolentCrimesPerPop\", inplace=True)\n",
        "\n",
        "# Group features.\n",
        "race_black_70_percentile = data_df[\"racepctblack\"].quantile(q=0.7)\n",
        "groups_df = (data_df[\"racepctblack\"] >= race_black_70_percentile)\n",
        "\n",
        "# Drop categorical features.\n",
        "data_df.drop(columns=[\"state\", \"county\", \"community\", \"communityname\", \"fold\"],\n",
        "             inplace=True)\n",
        "\n",
        "# Handle missing features.\n",
        "feature_names = data_df.columns\n",
        "for feature_name in feature_names:  \n",
        "    missing_rows = data_df[feature_name].isna()  # Which rows have missing values?\n",
        "    if missing_rows.any():  # Check if at least one row has a missing value.\n",
        "        data_df[feature_name].fillna(0.0, inplace=True)  # Fill NaN with 0.\n",
        "        missing_rows.rename(feature_name + \"_is_missing\", inplace=True)\n",
        "        data_df = data_df.join(missing_rows)  # Append boolean \"is_missing\" feature.\n",
        "\n",
        "labels = labels_df.values.astype(np.float32)\n",
        "groups = groups_df.values.astype(np.float32)\n",
        "features = data_df.values.astype(np.float32)\n",
        "\n",
        "# Set random seed so that the results are reproducible.\n",
        "np.random.seed(123456)\n",
        "\n",
        "# We randomly divide the examples into 'num_queries' queries.\n",
        "queries = np.random.randint(0, num_queries, size=features.shape[0])\n",
        "\n",
        "# Train and test indices.\n",
        "train_indices, test_indices = model_selection.train_test_split(\n",
        "    range(features.shape[0]), test_size=0.4)\n",
        "\n",
        "# Train features, labels and protected groups.\n",
        "train_set = {\n",
        "  'features': features[train_indices, :],\n",
        "  'labels': labels[train_indices],\n",
        "  'groups': groups[train_indices],\n",
        "  'queries': queries[train_indices],\n",
        "  'dimension': features.shape[-1],\n",
        "  'num_queries': num_queries\n",
        "}\n",
        "\n",
        "# Test features, labels and protected groups.\n",
        "test_set = {\n",
        "  'features': features[test_indices, :],\n",
        "  'labels': labels[test_indices],\n",
        "  'groups': groups[test_indices],\n",
        "  'queries': queries[test_indices],\n",
        "  'dimension': features.shape[-1],\n",
        "  'num_queries': num_queries\n",
        "}"
      ],
      "execution_count": 0,
      "outputs": []
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "JxBMPRJA2wvW",
        "colab_type": "text"
      },
      "source": [
        "## Evaluation Metrics"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "q7WpTPkKAga-",
        "colab_type": "text"
      },
      "source": [
        "We will need functions to convert labeled data into paired data."
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "u0zUW2wEYMes",
        "colab_type": "code",
        "colab": {}
      },
      "source": [
        "def pair_high_low_docs(data):\n",
        "  # Returns a DataFrame of pairs of larger-smaller labeled regression examples\n",
        "  # given in DataFrame.\n",
        "  # For all pairs of docs, and remove rows that are not needed.\n",
        "  pos_docs = data.copy()\n",
        "  neg_docs = data.copy()\n",
        "\n",
        "  # Include a merge key.\n",
        "  pos_docs.insert(0, \"merge_key\", 0)\n",
        "  neg_docs.insert(0, \"merge_key\", 0)\n",
        "\n",
        "  # Merge docs and drop merge key and label column.\n",
        "  pairs = pos_docs.merge(neg_docs, on=\"merge_key\", how=\"outer\",\n",
        "                         suffixes=(\"_pos\", \"_neg\"))\n",
        "\n",
        "  # Only retain rows where label_pos > label_neg.\n",
        "  pairs = pairs[pairs.label_pos > pairs.label_neg]\n",
        "\n",
        "  # Drop merge_key.\n",
        "  pairs.drop(columns=[\"merge_key\"], inplace=True)\n",
        "  return pairs\n",
        "\n",
        "\n",
        "def convert_labeled_to_paired_data(data_dict, index=None):\n",
        "  # Forms pairs of examples from each batch/query.\n",
        "\n",
        "  # Converts data arrays to pandas DataFrame with required column names and\n",
        "  # makes a call to convert_df_to_pairs and returns a dictionary.\n",
        "  features = data_dict['features']\n",
        "  labels = data_dict['labels']\n",
        "  groups = data_dict['groups']\n",
        "  queries = data_dict['queries']\n",
        "\n",
        "  if index is not None:\n",
        "    data_df = pd.DataFrame(features[queries == index, :])\n",
        "    data_df = data_df.assign(label=pd.DataFrame(labels[queries == index]))\n",
        "    data_df = data_df.assign(group=pd.DataFrame(groups[queries == index]))\n",
        "    data_df = data_df.assign(query_id=pd.DataFrame(queries[queries == index]))\n",
        "  else:\n",
        "    data_df = pd.DataFrame(features)\n",
        "    data_df = data_df.assign(label=pd.DataFrame(labels))\n",
        "    data_df = data_df.assign(group=pd.DataFrame(groups))\n",
        "    data_df = data_df.assign(query_id=pd.DataFrame(queries))\n",
        "\n",
        "  # Forms pairs of positive-negative docs for each query in given DataFrame\n",
        "  # if the DataFrame has a query_id column. Otherise forms pairs from all rows\n",
        "  # of the DataFrame.\n",
        "  data_pairs = data_df.groupby('query_id').apply(pair_high_low_docs)\n",
        "\n",
        "  # Create groups ndarray.\n",
        "  pos_groups = data_pairs['group_pos'].values.reshape(-1, 1)\n",
        "  neg_groups = data_pairs['group_neg'].values.reshape(-1, 1)\n",
        "  group_pairs = np.concatenate((pos_groups, neg_groups), axis=1)\n",
        "\n",
        "  # Create queries ndarray.\n",
        "  query_pairs = data_pairs['query_id_pos'].values.reshape(-1,)\n",
        "\n",
        "  # Create features ndarray.\n",
        "  feature_names = data_df.columns\n",
        "  feature_names = feature_names.drop(['query_id', 'label'])\n",
        "  feature_names = feature_names.drop(['group'])\n",
        "\n",
        "  pos_features = data_pairs[[str(s) + '_pos' for s in feature_names]].values\n",
        "  pos_features = pos_features.reshape(-1, 1, len(feature_names))\n",
        "\n",
        "  neg_features = data_pairs[[str(s) + '_neg' for s in feature_names]].values\n",
        "  neg_features = neg_features.reshape(-1, 1, len(feature_names))\n",
        "\n",
        "  feature_pairs = np.concatenate((pos_features, neg_features), axis=1)\n",
        "\n",
        "  # Paired data dict.\n",
        "  paired_data = {\n",
        "      'feature_pairs': feature_pairs, \n",
        "      'group_pairs': group_pairs, \n",
        "      'query_pairs': query_pairs,\n",
        "      'features': features,\n",
        "      'labels': labels,\n",
        "      'queries': queries,\n",
        "      'dimension': data_dict['dimension'],\n",
        "      'num_queries': data_dict['num_queries']\n",
        "  }\n",
        "\n",
        "  return paired_data"
      ],
      "execution_count": 0,
      "outputs": []
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "H4HV7a7wq7rm",
        "colab_type": "text"
      },
      "source": [
        "We will also need functions to evaluate the pairwise error rates for a linear model."
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "K8OQ4ado20p-",
        "colab_type": "code",
        "colab": {}
      },
      "source": [
        "def get_mask(groups, pos_group, neg_group=None):\n",
        "  # Returns a boolean mask selecting positive-negative document pairs where \n",
        "  # the protected group for  the positive document is pos_group and \n",
        "  # the protected group for the negative document (if specified) is neg_group.\n",
        "  # Repeat group membership positive docs as many times as negative docs.\n",
        "  mask_pos = groups[:, 0] == pos_group\n",
        "  \n",
        "  if neg_group is None:\n",
        "    return mask_pos\n",
        "  else:\n",
        "    mask_neg = groups[:, 1] == neg_group\n",
        "    return mask_pos & mask_neg\n",
        "\n",
        "\n",
        "def mean_squared_error(model, dataset):\n",
        "  # Returns mean squared error for Keras model on dataset.\n",
        "  scores = model.predict(dataset['features'])\n",
        "  labels = dataset['labels']\n",
        "  return np.mean((scores - labels) ** 2)\n",
        "\n",
        "\n",
        "def group_error_rate(model, dataset, pos_group, neg_group=None):\n",
        "  # Returns error rate for Keras model on data set, considering only document \n",
        "  # pairs where the protected group for the positive document is pos_group, and  \n",
        "  # the protected group for the negative document (if specified) is neg_group.\n",
        "  d = dataset['dimension']\n",
        "  scores0 = model.predict(dataset['feature_pairs'][:, 0, :].reshape(-1, d))\n",
        "  scores1 = model.predict(dataset['feature_pairs'][:, 1, :].reshape(-1, d))\n",
        "  mask = get_mask(dataset['group_pairs'], pos_group, neg_group)\n",
        "  diff = scores0 - scores1\n",
        "  diff = diff[mask > 0].reshape((-1))\n",
        "  return np.mean(diff < 0)"
      ],
      "execution_count": 0,
      "outputs": []
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "kI8xNJDcpQYP",
        "colab_type": "text"
      },
      "source": [
        "## Create Linear Model\n",
        "\n",
        "\n"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "lY4hvJAOra6s",
        "colab_type": "text"
      },
      "source": [
        "We then write a function to create the linear scoring model."
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "eTQOebAepXSu",
        "colab_type": "code",
        "colab": {}
      },
      "source": [
        "def create_scoring_model(feature_pairs, features, dimension):\n",
        "  # Returns a linear Keras scoring model, and returns a nullary function \n",
        "  # returning predictions on the features.\n",
        "\n",
        "  # Linear scoring model with no hidden layers.\n",
        "  layers = []\n",
        "  # Input layer takes `dimension` inputs.\n",
        "  layers.append(tf.keras.Input(shape=(dimension,)))\n",
        "  layers.append(tf.keras.layers.Dense(1)) \n",
        "  scoring_model = tf.keras.Sequential(layers)\n",
        "\n",
        "  # Create a nullary function that returns applies the linear model to the \n",
        "  # features and returns the tensor with the prediction differences on pairs.\n",
        "  def prediction_diffs():\n",
        "    scores0 = scoring_model(feature_pairs()[:, 0, :].reshape(-1, dimension))\n",
        "    scores1 = scoring_model(feature_pairs()[:, 1, :].reshape(-1, dimension))\n",
        "    return scores0 - scores1\n",
        "      \n",
        "  # Create a nullary function that returns the predictions on individual \n",
        "  # examples.\n",
        "  predictions = lambda: scoring_model(features())\n",
        "\n",
        "  return scoring_model, prediction_diffs, predictions"
      ],
      "execution_count": 0,
      "outputs": []
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "WIBvG3Arv7zR",
        "colab_type": "text"
      },
      "source": [
        "## Formulate Optimization Problem"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "SfZd-XPt0A8E",
        "colab_type": "text"
      },
      "source": [
        "We are ready to formulate the constrained optimization problem using the TFCO library. "
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "0AfVknixv9So",
        "colab_type": "code",
        "colab": {}
      },
      "source": [
        "def group_mask_fn(groups, pos_group, neg_group=None):\n",
        "  # Returns a nullary function returning group mask.\n",
        "  group_mask = lambda: np.reshape(\n",
        "      get_mask(groups(), pos_group, neg_group), (-1))\n",
        "  return group_mask\n",
        "\n",
        "\n",
        "def formulate_problem(\n",
        "    feature_pairs, group_pairs, features, labels, dimension, \n",
        "    constraint_groups=[], constraint_slack=None):\n",
        "  # Formulates a constrained problem that optimizes the squared error for a linear\n",
        "  # model on the specified dataset, subject to pairwise fairness constraints \n",
        "  # specified by the constraint_groups and the constraint_slack.\n",
        "  # \n",
        "  # Args:\n",
        "  #   feature_pairs: Nullary function returning paired features\n",
        "  #   group_pairs: Nullary function returning paired groups\n",
        "  #   features: Nullary function returning features\n",
        "  #   labels: Nullary function returning labels\n",
        "  #   dimension: Input dimension for scoring model\n",
        "  #   constraint_groups: List containing tuples of the form \n",
        "  #     ((pos_group0, neg_group0), (pos_group1, neg_group1)), specifying the \n",
        "  #     group memberships for the document pairs to compare in the constraints.\n",
        "  #   constraint_slack: slackness '\\epsilon' allowed in the constraints.\n",
        "  # Returns:\n",
        "  #   A RateMinimizationProblem object, and a Keras scoring model.\n",
        "\n",
        "  # Create linear scoring model: we get back a Keras model and a nullary  \n",
        "  # function returning predictions on the features.\n",
        "  scoring_model, prediction_diffs, predictions = create_scoring_model(\n",
        "      feature_pairs, features, dimension)\n",
        "  \n",
        "  # Context for the optimization objective.\n",
        "  context = tfco.rate_context(prediction_diffs)\n",
        "\n",
        "  # Squared loss objective.\n",
        "  squared_loss = lambda: tf.reduce_mean((predictions() - labels()) ** 2)\n",
        "  \n",
        "  # Constraint set.\n",
        "  constraint_set = []\n",
        "  \n",
        "  # Context for the constraints.\n",
        "  for ((pos_group0, neg_group0), (pos_group1, neg_group1)) in constraint_groups:\n",
        "    # Context for group 0.\n",
        "    group_mask0 = group_mask_fn(group_pairs, pos_group0, neg_group0)\n",
        "    context_group0 = context.subset(group_mask0)\n",
        "\n",
        "    # Context for group 1.\n",
        "    group_mask1 = group_mask_fn(group_pairs, pos_group1, neg_group1)\n",
        "    context_group1 = context.subset(group_mask1)\n",
        "\n",
        "    # Add constraints to constraint set.\n",
        "    constraint_set.append(\n",
        "        tfco.negative_prediction_rate(context_group0) <= (\n",
        "            tfco.negative_prediction_rate(context_group1) + constraint_slack))\n",
        "    constraint_set.append(\n",
        "        tfco.negative_prediction_rate(context_group1) <= (\n",
        "            tfco.negative_prediction_rate(context_group0) + constraint_slack))\n",
        "  \n",
        "  # Formulate constrained minimization problem.\n",
        "  problem = tfco.RateMinimizationProblem(\n",
        "      tfco.wrap_rate(squared_loss), constraint_set)\n",
        "  \n",
        "  return problem, scoring_model"
      ],
      "execution_count": 0,
      "outputs": []
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "P1x4yEllRKjH",
        "colab_type": "text"
      },
      "source": [
        "## Train Model"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "16nddoPIrmuj",
        "colab_type": "text"
      },
      "source": [
        "The following function then trains the linear model by solving the above constrained optimization problem. We first provide a training function with minibatch gradient updates. There are three types of pairwise fairness criterion we handle (specified by 'constraint_type'), and assign the (pos_group, neg_group) pairs to compare accordingly."
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "Md5pDHyBRN83",
        "colab_type": "code",
        "colab": {}
      },
      "source": [
        "def train_model(train_set, params):\n",
        "  # Trains the model with stochastic updates (one query per updates).\n",
        "  #\n",
        "  # Args:\n",
        "  #   train_set: Dictionary of \"paired\" training data.\n",
        "  #   params: Dictionary of hyper-paramters for training.\n",
        "  #\n",
        "  # Returns:\n",
        "  #   Trained model, list of objectives, list of group constraint violations.\n",
        "\n",
        "  # Set random seed for reproducibility.\n",
        "  random.seed(333333)\n",
        "  np.random.seed(121212)\n",
        "  tf.random.set_seed(212121)\n",
        "\n",
        "  # Set up problem and model.\n",
        "  if params['constrained']:\n",
        "    # Constrained optimization.\n",
        "    if params['constraint_type'] == 'marginal_equal_opportunity':\n",
        "      constraint_groups = [((0, None), (1, None))]\n",
        "    elif params['constraint_type'] == 'cross_group_equal_opportunity':\n",
        "      constraint_groups = [((0, 1), (1, 0))]\n",
        "    else:\n",
        "      constraint_groups = [((0, 1), (1, 0)), ((0, 0), (1, 1))]\n",
        "  else:\n",
        "    # Unconstrained optimization.\n",
        "    constraint_groups = []\n",
        "\n",
        "  # Dictionary that will hold batch features pairs, group pairs and labels for \n",
        "  # current batch. We include one query per-batch. \n",
        "  paired_batch = {}\n",
        "  batch_index = 0  # Index of current query.\n",
        "\n",
        "  # Data functions.\n",
        "  feature_pairs = lambda: paired_batch['feature_pairs']\n",
        "  group_pairs = lambda: paired_batch['group_pairs'] \n",
        "  features = lambda: paired_batch['features'] \n",
        "  labels = lambda: paired_batch['labels'] \n",
        "\n",
        "  # Create scoring model and constrained optimization problem.\n",
        "  problem, scoring_model = formulate_problem(\n",
        "      feature_pairs, group_pairs, features, labels, train_set['dimension'],\n",
        "      constraint_groups, params['constraint_slack'])\n",
        "  \n",
        "  # Create a loss function for the problem.\n",
        "  lagrangian_loss, update_ops, multipliers_variables = (\n",
        "      tfco.create_lagrangian_loss(problem, dual_scale=params['dual_scale']))\n",
        "\n",
        "  # Create optimizer\n",
        "  optimizer = tf.keras.optimizers.Adagrad(learning_rate=params['learning_rate'])\n",
        "  \n",
        "  # List of trainable variables.\n",
        "  var_list = (\n",
        "      scoring_model.trainable_weights + problem.trainable_variables + \n",
        "      [multipliers_variables])\n",
        "  \n",
        "  # List of objectives, group constraint violations.\n",
        "  # violations, and snapshot of models during course of training.\n",
        "  objectives = []\n",
        "  group_violations = []\n",
        "  models = []\n",
        "\n",
        "  feature_pair_batches = train_set['feature_pairs']\n",
        "  group_pair_batches = train_set['group_pairs']\n",
        "  query_pairs = train_set['query_pairs']  \n",
        "  feature_batches = train_set['features']\n",
        "  label_batches = train_set['labels']\n",
        "  queries = train_set['queries']  \n",
        "\n",
        "  print()\n",
        "  # Run loops * iterations_per_loop full batch iterations.\n",
        "  for ii in range(params['loops']):\n",
        "    for jj in range(params['iterations_per_loop']):\n",
        "      # Populate paired_batch dict with all pairs for current query. The batch\n",
        "      # index is the same as the current query index.\n",
        "      paired_batch = {\n",
        "          'feature_pairs': feature_pair_batches[query_pairs == batch_index],\n",
        "          'group_pairs': group_pair_batches[query_pairs == batch_index],\n",
        "          'features': feature_batches[queries == batch_index],\n",
        "          'labels': label_batches[queries == batch_index]\n",
        "      }\n",
        "\n",
        "      # Optimize loss.\n",
        "      update_ops()\n",
        "      optimizer.minimize(lagrangian_loss, var_list=var_list)\n",
        "\n",
        "      # Update batch_index, and cycle back once last query is reached.\n",
        "      batch_index = (batch_index + 1) % train_set['num_queries']\n",
        "    \n",
        "    # Snap shot current model.\n",
        "    model_copy = tf.keras.models.clone_model(scoring_model)\n",
        "    model_copy.set_weights(scoring_model.get_weights())\n",
        "    models.append(model_copy)\n",
        "\n",
        "    # Evaluate metrics for snapshotted model. \n",
        "    error, gerr, group_viol = evaluate_results(\n",
        "        scoring_model, train_set, params)\n",
        "    objectives.append(error)\n",
        "    group_violations.append(\n",
        "        [x - params['constraint_slack'] for x in group_viol])\n",
        "\n",
        "    sys.stdout.write(\n",
        "        '\\r Loop %d: error = %.3f, max constraint violation = %.3f' % \n",
        "        (ii, objectives[-1], max(group_violations[-1])))\n",
        "  print()\n",
        "  \n",
        "  if params['constrained']:\n",
        "    # Find model iterate that trades-off between objective and group violations.\n",
        "    best_index = tfco.find_best_candidate_index(\n",
        "        np.array(objectives), np.array(group_violations), rank_objectives=False)\n",
        "  else:\n",
        "    # Find model iterate that achieves lowest objective.\n",
        "    best_index = np.argmin(objectives)\n",
        "\n",
        "  return models[best_index]"
      ],
      "execution_count": 0,
      "outputs": []
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "WxFJV0tvKvyR",
        "colab_type": "text"
      },
      "source": [
        "## Summarize and Plot Results"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "i7In7Ra7M_S7",
        "colab_type": "text"
      },
      "source": [
        "Having trained a model, we will need functions to summarize the various evaluation metrics."
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "CBl5KfEOPApl",
        "colab_type": "code",
        "colab": {}
      },
      "source": [
        "def evaluate_results(model, test_set, params):\n",
        "  # Returns sqaured error, group error rates, group-level constraint violations.\n",
        "  if params['constraint_type'] == 'marginal_equal_opportunity':\n",
        "    g0_error = group_error_rate(model, test_set, 0)\n",
        "    g1_error = group_error_rate(model, test_set, 1)\n",
        "    group_violations = [g0_error - g1_error, g1_error - g0_error]\n",
        "    return (mean_squared_error(model, test_set), [g0_error, g1_error], \n",
        "            group_violations)\n",
        "  else:\n",
        "    g00_error = group_error_rate(model, test_set, 0, 0)\n",
        "    g01_error = group_error_rate(model, test_set, 0, 1)\n",
        "    g10_error = group_error_rate(model, test_set, 1, 1)\n",
        "    g11_error = group_error_rate(model, test_set, 1, 1)\n",
        "    group_violations_offdiag = [g01_error - g10_error, g10_error - g01_error]\n",
        "    group_violations_diag = [g00_error - g11_error, g11_error - g00_error]\n",
        "\n",
        "    if params['constraint_type'] == 'cross_group_equal_opportunity':\n",
        "      return (mean_squared_error(model, test_set), \n",
        "              [[g00_error, g01_error], [g10_error, g11_error]], \n",
        "              group_violations_offdiag)\n",
        "    else:\n",
        "      return (mean_squared_error(model, test_set), \n",
        "              [[g00_error, g01_error], [g10_error, g11_error]], \n",
        "              group_violations_offdiag + group_violations_diag)\n",
        "    \n",
        "\n",
        "def display_results(\n",
        "    model, test_set, params, method, error_type, show_header=False):\n",
        "  # Prints evaluation results for model on test data.\n",
        "  error, group_error, diffs = evaluate_results(model, test_set, params)\n",
        "\n",
        "  if params['constraint_type'] == 'marginal_equal_opportunity':\n",
        "    if show_header:\n",
        "      print('\\nMethod\\t\\t\\tError\\t\\tMSE\\t\\tGroup 0\\t\\tGroup 1\\t\\tDiff')\n",
        "    print('%s\\t%s\\t\\t%.3f\\t\\t%.3f\\t\\t%.3f\\t\\t%.3f' % (\n",
        "        method, error_type, error, group_error[0], group_error[1], \n",
        "        np.max(diffs)))\n",
        "  elif params['constraint_type'] == 'cross_group_equal_opportunity':\n",
        "    if show_header:\n",
        "      print('\\nMethod\\t\\t\\tError\\t\\tMSE\\t\\tGroup 0/1\\tGroup 1/0\\tDiff')\n",
        "    print('%s\\t%s\\t\\t%.3f\\t\\t%.3f\\t\\t%.3f\\t\\t%.3f' % (\n",
        "        method, error_type, error, group_error[0][1], group_error[1][0], \n",
        "        np.max(diffs)))\n",
        "  else:\n",
        "    if show_header:\n",
        "      print('\\nMethod\\t\\t\\tError\\t\\MSE\\t\\tGroup 0/1\\tGroup 1/0\\t' +\n",
        "            'Group 0/0\\tGroup 1/1\\tDiff')\n",
        "    print('%s\\t%s\\t\\t%.3f\\t\\t%.3f\\t\\t%.3f\\t\\t%.3f\\t\\t%.3f\\t\\t%.3f' % (\n",
        "        method, error_type, error, group_error[0][1], group_error[1][0], \n",
        "        group_error[0][0], group_error[1][1], np.max(diffs)))"
      ],
      "execution_count": 0,
      "outputs": []
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "PQR0nnORRedG",
        "colab_type": "text"
      },
      "source": [
        "# Experimental Results"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "jTYOW_EOsrWV",
        "colab_type": "text"
      },
      "source": [
        "We now run experiments with two types of pairwise fairness criteria: (1) marginal_equal_opportunity and (2) pairwise equal opportunity. In each case, we compare an unconstrained model trained to optimize just the squared error and a constrained model trained with pairwise fairness constraints.\n"
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "2h3v8OQUzh7-",
        "colab_type": "code",
        "colab": {}
      },
      "source": [
        "# Convert train/test set to paired data for later evaluation.\n",
        "paired_train_set = convert_labeled_to_paired_data(train_set)\n",
        "paired_test_set = convert_labeled_to_paired_data(test_set)"
      ],
      "execution_count": 0,
      "outputs": []
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "jqxzaPTEwEIn",
        "colab_type": "text"
      },
      "source": [
        "\n",
        "## (1) Marginal Equal Opportunity\n",
        "\n",
        "\n",
        "For a scoring model $f: \\mathbb{R}^d \\rightarrow \\mathbb{R}$, recall:\n",
        "- $sqerr(f)$ as the squared error for scoring function $f$.\n",
        "\n",
        "and we additionally define:\n",
        "\n",
        "- $err_i(f)$ as the row-marginal pairwise error over example pairs where the higher label example is from group $i$, and the lower label is from either groups\n",
        "\n",
        "$$\n",
        "err_i(f) = \\mathbf{E}\\big[\\mathbb{I}\\big(f(x) < f(x')\\big) \\,\\big|\\, y > y',~ grp(x) = i\\big]\n",
        "$$\n",
        "\n",
        "The constrained optimization problem we solve constraints the row-marginal pairwise errors to be similar:\n",
        "\n",
        "$$min_f\\;sqerr(f)$$\n",
        "\n",
        "$$\\text{s.t.   }\\;|err_0(f) - err_1(f)| \\leq 0.02$$\n"
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "1JsuylHbjrBX",
        "colab_type": "code",
        "outputId": "dd43cc66-1139-45c1-c941-37c696d7fe49",
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 230
        }
      },
      "source": [
        "# Model hyper-parameters.\n",
        "model_params = {\n",
        "    'loops': 10, \n",
        "    'iterations_per_loop': 250, \n",
        "    'learning_rate': 0.1,\n",
        "    'constraint_type': 'marginal_equal_opportunity', \n",
        "    'constraint_slack': 0.02,\n",
        "    'dual_scale': 1.0}\n",
        "\n",
        "# Unconstrained optimization.\n",
        "model_params['constrained'] = False\n",
        "model_unc  = train_model(paired_train_set, model_params)\n",
        "display_results(model_unc, paired_train_set, model_params, 'Unconstrained     ', \n",
        "                'Train', show_header=True)\n",
        "display_results(model_unc, paired_test_set, model_params,  'Unconstrained     ', \n",
        "                'Test')\n",
        "\n",
        "# Constrained optimization with TFCO.\n",
        "model_params['constrained'] = True\n",
        "model_con  = train_model(paired_train_set, model_params)\n",
        "display_results(model_con, paired_train_set, model_params, 'Constrained     ', \n",
        "                'Train', show_header=True)\n",
        "display_results(model_con, paired_test_set, model_params, 'Constrained     ', \n",
        "                'Test')"
      ],
      "execution_count": 33,
      "outputs": [
        {
          "output_type": "stream",
          "text": [
            "\n",
            " Loop 9: error = 0.057, max constraint violation = 0.041\n",
            "\n",
            "Method\t\t\tError\t\tMSE\t\tGroup 0\t\tGroup 1\t\tDiff\n",
            "Unconstrained     \tTrain\t\t0.057\t\t0.496\t\t0.435\t\t0.061\n",
            "Unconstrained     \tTest\t\t0.054\t\t0.478\t\t0.443\t\t0.035\n",
            "\n",
            " Loop 9: error = 0.057, max constraint violation = -0.018\n",
            "\n",
            "Method\t\t\tError\t\tMSE\t\tGroup 0\t\tGroup 1\t\tDiff\n",
            "Constrained     \tTrain\t\t0.057\t\t0.483\t\t0.485\t\t0.002\n",
            "Constrained     \tTest\t\t0.054\t\t0.466\t\t0.486\t\t0.019\n"
          ],
          "name": "stdout"
        }
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "colab_type": "text",
        "id": "CorY2URkop1Y"
      },
      "source": [
        "## (2) Pairwise Equal Opportunity\n",
        "\n",
        "Recall that we denote\n",
        " $err_{i,j}(f)$ as the pairwise error over example pairs where the higher label example is from group $i$, and the lower label example is from group $j$.\n",
        "$$\n",
        "err_{i, j}(f) ~=~ \\mathbf{E}\\big[\\mathbb{I}\\big(f(x) < f(x')\\big) \\,\\big|\\, y > y',~ grp(x) = i, ~grp(x') = j\\big]\n",
        "$$\n",
        "\n",
        "\n",
        "We first constrain only the cross-group errors, highlighted below.\n",
        "\n",
        "<br>\n",
        "<table border='1' bordercolor='black'>\n",
        "  <tr >\n",
        "     <td bgcolor='white'> </td>\n",
        "     <td bgcolor='white'> </td>\n",
        "     <td bgcolor='white'  colspan=2 align=center><b>Negative</b></td>\n",
        "  </tr>\n",
        "  <tr>\n",
        "    <td bgcolor='white'></td>\n",
        "    <td bgcolor='white'></td>\n",
        "    <td>Group 0</td>\n",
        "    <td>Group 1</td>\n",
        "  </tr>\n",
        "  <tr>\n",
        "    <td bgcolor='white' rowspan=2><b>Positive</b></td>\n",
        "    <td bgcolor='white'>Group 0</td>\n",
        "    <td bgcolor='white'>$err_{0,0}$</td>\n",
        "    <td bgcolor='white'>$\\mathbf{err_{0,1}}$</td>\n",
        "  </tr>\n",
        "  <tr>\n",
        "    <td>Group 1</td>\n",
        "     <td bgcolor='white'>$\\mathbf{err_{1,0}}$</td>\n",
        "      <td bgcolor='white'>$err_{1,1}$</td>\n",
        "  </tr>\n",
        "</table>\n",
        "<br>\n",
        "\n",
        "The optimization problem we solve constraints the cross-group pairwise errors to be similar:\n",
        "\n",
        "$$min_f\\; sqerr(f)$$\n",
        "$$\\text{s.t. }\\;\\; |err_{0,1}(f) - err_{1,0}(f)| \\leq 0.02$$\n"
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "jI7EnjqhQiyY",
        "colab_type": "code",
        "outputId": "701287a3-1b47-4990-a499-9d84d666a089",
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 230
        }
      },
      "source": [
        "# Model hyper-parameters.\n",
        "model_params = {\n",
        "    'loops': 10, \n",
        "    'iterations_per_loop': 250, \n",
        "    'learning_rate': 0.1,\n",
        "    'constraint_type': 'cross_group_equal_opportunity', \n",
        "    'constraint_slack': 0.02,\n",
        "    'dual_scale': 1.0}\n",
        "\n",
        "# Unconstrained optimization.\n",
        "model_params['constrained'] = False\n",
        "model_unc  = train_model(paired_train_set, model_params)\n",
        "display_results(model_unc, paired_train_set, model_params, 'Unconstrained     ', \n",
        "                'Train', show_header=True)\n",
        "display_results(model_unc, paired_test_set, model_params,  'Unconstrained     ', \n",
        "                'Test')\n",
        "\n",
        "# Constrained optimization with TFCO.\n",
        "model_params['constrained'] = True\n",
        "model_con  = train_model(paired_train_set, model_params)\n",
        "display_results(model_con, paired_train_set, model_params, 'Constrained     ', \n",
        "                'Train', show_header=True)\n",
        "display_results(model_con, paired_test_set, model_params, 'Constrained     ', \n",
        "                'Test')"
      ],
      "execution_count": 34,
      "outputs": [
        {
          "output_type": "stream",
          "text": [
            "\n",
            " Loop 9: error = 0.057, max constraint violation = 0.071\n",
            "\n",
            "Method\t\t\tError\t\tMSE\t\tGroup 0/1\tGroup 1/0\tDiff\n",
            "Unconstrained     \tTrain\t\t0.057\t\t0.529\t\t0.438\t\t0.091\n",
            "Unconstrained     \tTest\t\t0.054\t\t0.516\t\t0.446\t\t0.070\n",
            "\n",
            " Loop 9: error = 0.057, max constraint violation = 0.013\n",
            "\n",
            "Method\t\t\tError\t\tMSE\t\tGroup 0/1\tGroup 1/0\tDiff\n",
            "Constrained     \tTrain\t\t0.058\t\t0.484\t\t0.457\t\t0.027\n",
            "Constrained     \tTest\t\t0.055\t\t0.478\t\t0.476\t\t0.003\n"
          ],
          "name": "stdout"
        }
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "MN82EPeEt-23",
        "colab_type": "code",
        "colab": {}
      },
      "source": [
        ""
      ],
      "execution_count": 0,
      "outputs": []
    }
  ]
}